In [136]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from wordcloud import WordCloud
from matplotlib.colors import LinearSegmentedColormap

%matplotlib inline
In [93]:
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as the object has been read, rather than being left open until garbage
    collection.
    """
    # SECURITY: pickle.load can run arbitrary code embedded in the file;
    # only load artifacts produced by a trusted pipeline.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


# Inputs for the main scatter panel. The plotting cell indexes Xnum_trunc as
# a 2-D array ([:, 0] / [:, 1]), iterates over w reading feat[0] / feat[1],
# and uses y both as point colours and as the total count (len(y)).
Xnum_trunc = _load_pickle('Xnum_trunc')
w = _load_pickle('Xnum_w')
y = _load_pickle('Xnum_y')

# One (embedding, labels) pair per cluster 0..4; the plotting cell scatters
# X_tsneK[:, 0] against X_tsneK[:, 1] coloured by y_K.
# presumably t-SNE coordinates and k-means labels, judging by the file names -- TODO confirm
X_tsne0 = _load_pickle('X3_bow_ts0')
y_0 = _load_pickle('y_kmeans0')
X_tsne1 = _load_pickle('X3_bow_ts1')
y_1 = _load_pickle('y_kmeans1')
X_tsne2 = _load_pickle('X3_bow_ts2')
y_2 = _load_pickle('y_kmeans2')
X_tsne3 = _load_pickle('X3_bow_ts3')
y_3 = _load_pickle('y_kmeans3')
X_tsne4 = _load_pickle('X3_bow_ts4')
y_4 = _load_pickle('y_kmeans4')
Out[93]:
2044
In [134]:
# Figure 1: scatter of the numeric post measures (large left panel) next to
# one small scatter per cluster (right-hand columns of a 3x5 grid).
fig = plt.figure(figsize=(15, 9))
fig.suptitle('Posts Measures and Themes Clustering')
gs = gridspec.GridSpec(3, 5)

# Main panel spans every row and all but the last two columns.
ax1 = fig.add_subplot(gs[:, :-2])
# The second coordinate is negated, i.e. the cloud is drawn flipped vertically.
# NOTE(review): the arrows below use feat[1] without the same negation --
# confirm that w is already expressed in the flipped orientation.
ax1.scatter(Xnum_trunc[:, 0], -Xnum_trunc[:, 1], alpha=0.8, c=y)
ax1.set_axis_off()

# One arrow + label per row of w, all starting from the fixed point (0.2, -0.4)
# and scaled by 0.5. zip() stops at the shorter input, so w is assumed to hold
# exactly one row per feature name, in this order -- TODO confirm.
feature_names = ['Score(log)', 'ViewCount(log)', 'AnswerCount', 'CommentCount', 'FavoriteCount']
for feat, feat_name in zip(w, feature_names):
    dx, dy = .5 * feat[0], .5 * feat[1]
    ax1.arrow(0.2, -0.4, dx, dy, color='k', width=0.003, ec='none')
    # The '(log)' labels get a larger horizontal offset from the arrow tip.
    if feat_name.endswith('(log)'):
        label_x, label_y = 0.3 + dx, -0.4 + dy
    else:
        label_x, label_y = 0.22 + dx, -0.38 + dy
    ax1.text(label_x, label_y, feat_name, ha='center', color='k')

# Small panels: (grid cell, 2-D coordinates, labels, colormap, title).
cluster_panels = [
    (gs[0, 3], X_tsne0, y_0, 'Purples', 'Mod Views, High Eng'),
    (gs[0, 4], X_tsne1, y_1, 'Blues', 'Mod Views, Low Eng'),
    (gs[1, 3], X_tsne2, y_2, 'BuGn', 'Low Views, Low Eng'),
    (gs[1, 4], X_tsne3, y_3, 'Greens', 'Low Views, High Eng'),
    (gs[2, 3], X_tsne4, y_4, 'YlOrRd', 'High Views, High Eng'),
]
cluster_axes = []
for grid_cell, coords, labels, cmap_name, panel_title in cluster_panels:
    panel = fig.add_subplot(grid_cell)
    panel.scatter(coords[:, 0], coords[:, 1], alpha=0.8, c=labels, cmap=cmap_name)
    # Share of all posts (len(y)) that fall in this cluster, shown top-left.
    panel.annotate(f'{len(labels)*100/len(y):.2f}%', xy=(0, 0.9), xycoords='axes fraction')
    panel.set_title(panel_title)
    panel.set_axis_off()
    cluster_axes.append(panel)

# Keep the individual axes names available for any later cell that uses them.
ax2, ax3, ax4, ax5, ax6 = cluster_axes
In [138]:
# Load the pickled word clouds for clusters 0..4. Each file is opened in a
# ``with`` block so its handle is closed right after reading instead of
# being leaked. The plotting cell indexes each loaded object by position
# (wordcloudK[i]) and passes the items to imshow.
# SECURITY: pickle.load can run arbitrary code embedded in the file; only
# load artifacts produced by a trusted pipeline.
loaded_wordclouds = []
for cluster_idx in range(5):
    with open(f'wordcloud{cluster_idx}', 'rb') as fh:
        loaded_wordclouds.append(pickle.load(fh))

wordcloud0, wordcloud1, wordcloud2, wordcloud3, wordcloud4 = loaded_wordclouds
In [141]:
# Figure 2: 10x6 grid of word clouds. Column 0 carries the row (topic) labels,
# columns 1-5 are the five post groups; a cell stays an empty grey frame when
# the group has no word cloud assigned to that row.
# Note: 15x25 in at dpi=500 is a 7500x12500 px canvas, so rendering is slow.
n_rows, n_cols = 10, 6
fig, ax = plt.subplots(n_rows, n_cols, figsize=(15, 25), sharey=False, dpi=500)
fig.subplots_adjust(wspace=0, hspace=0)

for i in range(n_rows):
    ax[i, 0].set_axis_off()
    for j in range(1, n_cols):
        # Thin grey frame and no ticks around every word-cloud cell.
        for side in ('top', 'left', 'bottom', 'right'):
            ax[i, j].spines[side].set_color('Grey')
        ax[i, j].set_xticks([])
        ax[i, j].set_yticks([])

# Column headers (placed on the top row only).
column_titles = {
    1: '$Hot$ $Posts$\nHi View, Hi Eng',
    2: '$Trending$\nMod View, Hi Eng',
    3: '$S.O.S$\nMod View, Low Eng',
    4: '$Curious$ $Topics$\nLow View, Hi Eng',
    5: '$Spam$\nLow View, Low Eng',
}
for col, col_title in column_titles.items():
    ax[0, col].set_title(col_title)

# Two-colour gradients used to recolour the clouds of each column.
cmap_g = LinearSegmentedColormap.from_list('mycmap', ['#11644D', '#A0B046'])
cmap_p = LinearSegmentedColormap.from_list('mycmap', ['#DDA0DD', '#4B0082'])
cmap_b = LinearSegmentedColormap.from_list('mycmap', ['#107FC9', '#0B108C'])
cmap_bg = LinearSegmentedColormap.from_list('mycmap', ['#20B2AA', '#008080'])

# (column, clouds, colormap or None, target rows): the cloud at position k of
# `clouds` is drawn in row target_rows[k]. None keeps the cloud's own colours.
column_layout = [
    (1, wordcloud4, None, [0, 9, 1, 5, 6]),
    (4, wordcloud3, cmap_g, [3, 2, 9, 5, 1, 4]),
    (2, wordcloud0, cmap_p, [0, 5, 1, 6, 9, 2]),
    (3, wordcloud1, cmap_b, [2, 7, 5, 9, 8, 0, 1]),
    (5, wordcloud2, cmap_bg, [6, 0, 9, 5, 3]),
]
for col, clouds, cmap, target_rows in column_layout:
    for cloud_idx, row in enumerate(target_rows):
        # Explicit indexing so a missing cloud still raises IndexError.
        cloud = clouds[cloud_idx]
        if cmap is not None:
            # recolor() updates the cloud in place and returns it.
            cloud = cloud.recolor(colormap=cmap)
        ax[row, col].imshow(cloud, aspect='auto')

# Row labels, written inside the (axis-less) first column.
# NOTE(review): '$IEngines$' looks like a typo in the row-8 label -- confirm
# the intended wording before changing the rendered text.
row_labels = [
    '$Pandas$',
    '$Numpy$',
    '$String$\n$Processing$',
    '$Lists$',
    '$Classes$',
    '$Django$',
    '$Matplotlib$',
    '$Web$\n$Interfaces$',
    '$Web$\n$IEngines$',
    '$Mix$',
]
# Ending the cell on a loop (not a bare annotate call) keeps the Text(...)
# repr of the last annotation out of the cell output.
for row, row_label in enumerate(row_labels):
    ax[row, 0].annotate(row_label, xy=(0.3, 0.5), xycoords='axes fraction', fontsize=12)
Out[141]:
Text(0.3, 0.5, '$Mix$')
In [108]:
# Number of points carrying each distinct label in y_4. With
# return_counts=True, np.unique returns (unique values, their counts);
# only the counts are displayed.
labels_4, sizes_4 = np.unique(y_4, return_counts=True)
list(sizes_4)
Out[108]:
[24, 263, 21, 25, 9]
In [110]:
# Exploratory lookup: print the built-in help text for the object stored at
# index 5 of wordcloud0 (a WordCloud instance, per the captured output).
# Nothing later in the visible notebook depends on this cell.
help(wordcloud0[5])
Help on WordCloud in module wordcloud.wordcloud object:

class WordCloud(builtins.object)
 |  Word cloud object for generating and drawing.
 |  
 |  Parameters
 |  ----------
 |  font_path : string
 |      Font path to the font that will be used (OTF or TTF).
 |      Defaults to DroidSansMono path on a Linux machine. If you are on
 |      another OS or don't have this font, you need to adjust this path.
 |  
 |  width : int (default=400)
 |      Width of the canvas.
 |  
 |  height : int (default=200)
 |      Height of the canvas.
 |  
 |  prefer_horizontal : float (default=0.90)
 |      The ratio of times to try horizontal fitting as opposed to vertical.
 |      If prefer_horizontal < 1, the algorithm will try rotating the word
 |      if it doesn't fit. (There is currently no built-in way to get only
 |      vertical words.)
 |  
 |  mask : nd-array or None (default=None)
 |      If not None, gives a binary mask on where to draw words. If mask is not
 |      None, width and height will be ignored and the shape of mask will be
 |      used instead. All white (#FF or #FFFFFF) entries will be considered
 |      "masked out" while other entries will be free to draw on. [This
 |      changed in the most recent version!]
 |  
 |  contour_width: float (default=0)
 |      If mask is not None and contour_width > 0, draw the mask contour.
 |  
 |  contour_color: color value (default="black")
 |      Mask contour color.
 |  
 |  scale : float (default=1)
 |      Scaling between computation and drawing. For large word-cloud images,
 |      using scale instead of larger canvas size is significantly faster, but
 |      might lead to a coarser fit for the words.
 |  
 |  min_font_size : int (default=4)
 |      Smallest font size to use. Will stop when there is no more room in this
 |      size.
 |  
 |  font_step : int (default=1)
 |      Step size for the font. font_step > 1 might speed up computation but
 |      give a worse fit.
 |  
 |  max_words : number (default=200)
 |      The maximum number of words.
 |  
 |  stopwords : set of strings or None
 |      The words that will be eliminated. If None, the built-in STOPWORDS
 |      list will be used. Ignored if using generate_from_frequencies.
 |  
 |  background_color : color value (default="black")
 |      Background color for the word cloud image.
 |  
 |  max_font_size : int or None (default=None)
 |      Maximum font size for the largest word. If None, height of the image is
 |      used.
 |  
 |  mode : string (default="RGB")
 |      Transparent background will be generated when mode is "RGBA" and
 |      background_color is None.
 |  
 |  relative_scaling : float (default='auto')
 |      Importance of relative word frequencies for font-size.  With
 |      relative_scaling=0, only word-ranks are considered.  With
 |      relative_scaling=1, a word that is twice as frequent will have twice
 |      the size.  If you want to consider the word frequencies and not only
 |      their rank, relative_scaling around .5 often looks good.
 |      If 'auto' it will be set to 0.5 unless repeat is true, in which
 |      case it will be set to 0.
 |  
 |      .. versionchanged: 2.0
 |          Default is now 'auto'.
 |  
 |  color_func : callable, default=None
 |      Callable with parameters word, font_size, position, orientation,
 |      font_path, random_state that returns a PIL color for each word.
 |      Overwrites "colormap".
 |      See colormap for specifying a matplotlib colormap instead.
 |      To create a word cloud with a single color, use
 |      ``color_func=lambda *args, **kwargs: "white"``.
 |      The single color can also be specified using RGB code. For example
 |      ``color_func=lambda *args, **kwargs: (255,0,0)`` sets color to red.
 |  
 |  regexp : string or None (optional)
 |      Regular expression to split the input text into tokens in process_text.
 |      If None is specified, ``r"\w[\w']+"`` is used. Ignored if using
 |      generate_from_frequencies.
 |  
 |  collocations : bool, default=True
 |      Whether to include collocations (bigrams) of two words. Ignored if using
 |      generate_from_frequencies.
 |  
 |  
 |      .. versionadded: 2.0
 |  
 |  colormap : string or matplotlib colormap, default="viridis"
 |      Matplotlib colormap to randomly draw colors from for each word.
 |      Ignored if "color_func" is specified.
 |  
 |      .. versionadded: 2.0
 |  
 |  normalize_plurals : bool, default=True
 |      Whether to remove trailing 's' from words. If True and a word
 |      appears with and without a trailing 's', the one with trailing 's'
 |      is removed and its counts are added to the version without
 |      trailing 's' -- unless the word ends with 'ss'. Ignored if using
 |      generate_from_frequencies.
 |  
 |  repeat : bool, default=False
 |      Whether to repeat words and phrases until max_words or min_font_size
 |      is reached.
 |  
 |  Attributes
 |  ----------
 |  ``words_`` : dict of string to float
 |      Word tokens with associated frequency.
 |  
 |      .. versionchanged: 2.0
 |          ``words_`` is now a dictionary
 |  
 |  ``layout_`` : list of tuples (string, int, (int, int), int, color))
 |      Encodes the fitted word cloud. Encodes for each word the string, font
 |      size, position, orientation and color.
 |  
 |  Notes
 |  -----
 |  Larger canvases will make the code significantly slower. If you need a
 |  large word cloud, try a lower canvas size, and set the scale parameter.
 |  
 |  The algorithm might give more weight to the ranking of the words
 |  than their actual frequencies, depending on the ``max_font_size`` and the
 |  scaling heuristic.
 |  
 |  Methods defined here:
 |  
 |  __array__(self)
 |      Convert to numpy array.
 |      
 |      Returns
 |      -------
 |      image : nd-array size (width, height, 3)
 |          Word cloud image as numpy matrix.
 |  
 |  __init__(self, font_path=None, width=400, height=200, margin=2, ranks_only=None, prefer_horizontal=0.9, mask=None, scale=1, color_func=None, max_words=200, min_font_size=4, stopwords=None, random_state=None, background_color='black', max_font_size=None, font_step=1, mode='RGB', relative_scaling='auto', regexp=None, collocations=True, colormap=None, normalize_plurals=True, contour_width=0, contour_color='black', repeat=False)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit_words(self, frequencies)
 |      Create a word_cloud from words and frequencies.
 |      
 |      Alias to generate_from_frequencies.
 |      
 |      Parameters
 |      ----------
 |      frequencies : dict from string to float
 |          A contains words and associated frequency.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  generate(self, text)
 |      Generate wordcloud from text.
 |      
 |      The input "text" is expected to be a natural text. If you pass a sorted
 |      list of words, words will appear in your output twice. To remove this
 |      duplication, set ``collocations=False``.
 |      
 |      Alias to generate_from_text.
 |      
 |      Calls process_text and generate_from_frequencies.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  generate_from_frequencies(self, frequencies, max_font_size=None)
 |      Create a word_cloud from words and frequencies.
 |      
 |      Parameters
 |      ----------
 |      frequencies : dict from string to float
 |          A contains words and associated frequency.
 |      
 |      max_font_size : int
 |          Use this font-size instead of self.max_font_size
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  generate_from_text(self, text)
 |      Generate wordcloud from text.
 |      
 |      The input "text" is expected to be a natural text. If you pass a sorted
 |      list of words, words will appear in your output twice. To remove this
 |      duplication, set ``collocations=False``.
 |      
 |      Calls process_text and generate_from_frequencies.
 |      
 |      ..versionchanged:: 1.2.2
 |          Argument of generate_from_frequencies() is not return of
 |          process_text() any more.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  process_text(self, text)
 |      Splits a long text into words, eliminates the stopwords.
 |      
 |      Parameters
 |      ----------
 |      text : string
 |          The text to be processed.
 |      
 |      Returns
 |      -------
 |      words : dict (string, int)
 |          Word tokens with associated frequency.
 |      
 |      ..versionchanged:: 1.2.2
 |          Changed return type from list of tuples to dict.
 |      
 |      Notes
 |      -----
 |      There are better ways to do word tokenization, but I don't want to
 |      include all those things.
 |  
 |  recolor(self, random_state=None, color_func=None, colormap=None)
 |      Recolor existing layout.
 |      
 |      Applying a new coloring is much faster than generating the whole
 |      wordcloud.
 |      
 |      Parameters
 |      ----------
 |      random_state : RandomState, int, or None, default=None
 |          If not None, a fixed random state is used. If an int is given, this
 |          is used as seed for a random.Random state.
 |      
 |      color_func : function or None, default=None
 |          Function to generate new color from word count, font size, position
 |          and orientation.  If None, self.color_func is used.
 |      
 |      colormap : string or matplotlib colormap, default=None
 |          Use this colormap to generate new colors. Ignored if color_func
 |          is specified. If None, self.color_func (or self.color_map) is used.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  to_array(self)
 |      Convert to numpy array.
 |      
 |      Returns
 |      -------
 |      image : nd-array size (width, height, 3)
 |          Word cloud image as numpy matrix.
 |  
 |  to_file(self, filename)
 |      Export to image file.
 |      
 |      Parameters
 |      ----------
 |      filename : string
 |          Location to write to.
 |      
 |      Returns
 |      -------
 |      self
 |  
 |  to_html(self)
 |  
 |  to_image(self)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)